TRAIN AND TEST DATA

# Load train data
train <- pseq(subset = "train")
print(train)
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 5748 taxa and 3615 samples ]
sample_data() Sample Data:       [ 3615 samples by 12 sample variables ]
tax_table()   Taxonomy Table:    [ 5748 taxa by 7 taxonomic ranks ]
test <- pseq(subset = "test")
print(test)
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 5748 taxa and 1809 samples ]
sample_data() Sample Data:       [ 1809 samples by 12 sample variables ]
tax_table()   Taxonomy Table:    [ 5748 taxa by 7 taxonomic ranks ]

EXPLORING SYNTHETIC CLINICAL DATA

# Check clinical variables
pheno <- data.frame(sample_data(train))
pheno <- pheno[complete.cases(pheno), ] # remove NA's
pheno[, c(3:8, 12)] <- apply(pheno[, c(3:8, 12)], 2, function(x) ifelse(x == 1, "yes", "no"))
tab <- createTable(compareGroups(Event ~ ., data = pheno))
export2md(tab)
Summary descriptives table by groups of `Event’
no yes p.overall
N=3242 N=298
Age 49.0 (14.8) 55.5 (13.8) <0.001
BodyMassIndex 27.0 (4.68) 28.0 (5.11) 0.002
Smoking: 0.039
no 2466 (76.1%) 243 (81.5%)
yes 776 (23.9%) 55 (18.5%)
BPTreatment: 0.307
no 2744 (84.6%) 245 (82.2%)
yes 498 (15.4%) 53 (17.8%)
PrevalentDiabetes: 0.806
no 3042 (93.8%) 278 (93.3%)
yes 200 (6.17%) 20 (6.71%)
PrevalentCHD: 0.024
no 3142 (96.9%) 281 (94.3%)
yes 100 (3.08%) 17 (5.70%)
PrevalentHFAIL: 0.926
no 3157 (97.4%) 291 (97.7%)
yes 85 (2.62%) 7 (2.35%)
Event_time 13.9 (5.62) 12.5 (6.13) <0.001
SystolicBP 136 (21.8) 141 (24.7) 0.001
NonHDLcholesterol 4.07 (1.08) 4.19 (1.23) 0.122
Sex: 0.604
no 1808 (55.8%) 161 (54.0%)
yes 1434 (44.2%) 137 (46.0%)
# View table (without NA's)
DT::datatable(pheno)
# Remove artifacts
pheno2 <- pheno[-which(pheno$PrevalentHFAIL == "yes" & 
                     pheno$Event == "yes"), ]
# Plot Event time distribution
hist(pheno$Event_time[which(pheno$Event == "yes")], 10)

hist(pheno$Event_time[which(pheno$Event == "no")], 10)

# Plot Age
hist(pheno$Age[which(pheno$Event == "yes")], 10)

hist(pheno$Age[which(pheno$Event == "no")], 10)

# Plot BMI
hist(pheno$BodyMassIndex[which(pheno$Event == "yes")], 10)

hist(pheno$BodyMassIndex[which(pheno$Event == "no")], 10)

MICROBIOME PREPROCESSING PIPELINE

# Load data
train <- pseq(subset = "train")
test <- pseq(subset = "test")

# Remove samples
train <- remove_samples(train,
                        remove_nas = TRUE,
                        remove_neg = FALSE)
[1] "Samples with NA's in time or event have been removed!"
# Agglomerate by Species
train <- tax_glom(train, taxrank =  "Species")
train <- subset_taxa(train, Species != "s__")
test <- tax_glom(test, taxrank = "Species")
test <- subset_taxa(test, Species != "s__")

# Calculate richness
richness_train <- estimate_richness(
                        train,
                        split = TRUE,
                        measures = c("Shannon", "Observed",
                                     "Chao1", "Simpson",
                                     "InvSimpson", "Fisher"))

richness_test <- estimate_richness(
                        test,
                        split = TRUE,
                        measures = c("Shannon", "Observed",
                                     "Chao1", "Simpson",
                                     "InvSimpson", "Fisher"))

# Filter taxa by counts
train <- filter_taxa(train,
                     function(x) sum(x > 2) > (0.8 * length(x)), TRUE)

# Calculate co-abundances
coab_taxa <- co_abundances(train)
print(table(coab_taxa$clusters))

 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 
17  2 14 18  3 20 13 21  2  7  8 13  2  5 21  6 12  2 19  6 19 10  2  7 
DT::datatable(coab_taxa)
# Caculate GSVA
scores <- get_scores(coab_taxa, train, test, method = "gsva")
Estimating GSVA scores for 24 gene sets.
Estimating ECDFs with Gaussian kernels

  |                                                                                      
  |                                                                                |   0%
  |                                                                                      
  |===                                                                             |   4%
  |                                                                                      
  |=======                                                                         |   8%
  |                                                                                      
  |==========                                                                      |  12%
  |                                                                                      
  |=============                                                                   |  17%
  |                                                                                      
  |=================                                                               |  21%
  |                                                                                      
  |====================                                                            |  25%
  |                                                                                      
  |=======================                                                         |  29%
  |                                                                                      
  |===========================                                                     |  33%
  |                                                                                      
  |==============================                                                  |  38%
  |                                                                                      
  |=================================                                               |  42%
  |                                                                                      
  |=====================================                                           |  46%
  |                                                                                      
  |========================================                                        |  50%
  |                                                                                      
  |===========================================                                     |  54%
  |                                                                                      
  |===============================================                                 |  58%
  |                                                                                      
  |==================================================                              |  62%
  |                                                                                      
  |=====================================================                           |  67%
  |                                                                                      
  |=========================================================                       |  71%
  |                                                                                      
  |============================================================                    |  75%
  |                                                                                      
  |===============================================================                 |  79%
  |                                                                                      
  |===================================================================             |  83%
  |                                                                                      
  |======================================================================          |  88%
  |                                                                                      
  |=========================================================================       |  92%
  |                                                                                      
  |=============================================================================   |  96%
  |                                                                                      
  |================================================================================| 100%

Estimating GSVA scores for 24 gene sets.
Estimating ECDFs with Gaussian kernels

  |                                                                                      
  |                                                                                |   0%
  |                                                                                      
  |===                                                                             |   4%
  |                                                                                      
  |=======                                                                         |   8%
  |                                                                                      
  |==========                                                                      |  12%
  |                                                                                      
  |=============                                                                   |  17%
  |                                                                                      
  |=================                                                               |  21%
  |                                                                                      
  |====================                                                            |  25%
  |                                                                                      
  |=======================                                                         |  29%
  |                                                                                      
  |===========================                                                     |  33%
  |                                                                                      
  |==============================                                                  |  38%
  |                                                                                      
  |=================================                                               |  42%
  |                                                                                      
  |=====================================                                           |  46%
  |                                                                                      
  |========================================                                        |  50%
  |                                                                                      
  |===========================================                                     |  54%
  |                                                                                      
  |===============================================                                 |  58%
  |                                                                                      
  |==================================================                              |  62%
  |                                                                                      
  |=====================================================                           |  67%
  |                                                                                      
  |=========================================================                       |  71%
  |                                                                                      
  |============================================================                    |  75%
  |                                                                                      
  |===============================================================                 |  79%
  |                                                                                      
  |===================================================================             |  83%
  |                                                                                      
  |======================================================================          |  88%
  |                                                                                      
  |=========================================================================       |  92%
  |                                                                                      
  |=============================================================================   |  96%
  |                                                                                      
  |================================================================================| 100%
DT::datatable(scores$train)
DT::datatable(scores$test)
# Create train and test data
pheno_train <- sample_data(train)
x_train <- data.frame(pheno_train[, -c(8, 9)],
                      richness_train,
                      scores$train)
y_train <- pheno_train[, c("Event_time", "Event")]

pheno_test <- sample_data(test)
x_test <- data.frame(pheno_test[, -c(8, 9)],
                     richness_test,
                     scores$test)
y_test <- pheno_test[, c("Event_time", "Event")]

PREPARE DATA TO TRAIN

# Creating train and test data
train <- cbind.data.frame(x_train, y_train)
test  <- cbind.data.frame(x_test, y_test)

# What do we do with ...
# ======
# Negatives survival values in train and test?
train <- train[-which(train$Event_time < 0), ]
test$Event_time[which(test$Event_time < 0)] <- 15

# NA's values in train and test?
train <- train[complete.cases(train), ]
test <- missRanger(test, pmm.k = 10, seed = 153)

Missing value imputation by random forests

  Variables to impute:      Smoking, PrevalentDiabetes, PrevalentCHD, PrevalentHFAIL, SystolicBP, NonHDLcholesterol, Event_time, Event
  Variables used to impute: Age, BodyMassIndex, Smoking, BPTreatment, PrevalentDiabetes, PrevalentCHD, PrevalentHFAIL, SystolicBP, NonHDLcholesterol, Sex, Observed, Chao1, se.chao1, Shannon, Simpson, InvSimpson, Fisher, cluster_1, cluster_2, cluster_3, cluster_4, cluster_5, cluster_6, cluster_7, cluster_8, cluster_9, cluster_10, cluster_11, cluster_12, cluster_13, cluster_14, cluster_15, cluster_16, cluster_17, cluster_18, cluster_19, cluster_20, cluster_21, cluster_22, cluster_23, cluster_24, cluster_25, Event_time, Event
iter 1: ........
iter 2: ........
iter 3: ........
iter 4: ........
iter 5: ........
# Removing PrevalentHFAIL (only 0)
train <- train[, -grep("PrevalentHFAIL", colnames(train))]
test <- test[, -grep("PrevalentHFAIL", colnames(test))]

DT::datatable(train)
Warning in instance$preRenderHook(instance): It seems your data is too big for client-
side DataTables. You may consider server-side processing: https://rstudio.github.io/DT/
server.html

FIT THE MODEL

# Fit model
# ========
cvrts <- c("Age", "BodyMassIndex",
           "SystolicBP", "NonHDLcholesterol",
           "Sex")

(biomarkers <- setdiff(colnames(train), c(cvrts, colnames(y_train))))
 [1] "Smoking"           "BPTreatment"       "PrevalentDiabetes" "PrevalentCHD"     
 [5] "Observed"          "Chao1"             "se.chao1"          "Shannon"          
 [9] "Simpson"           "InvSimpson"        "Fisher"            "cluster_1"        
[13] "cluster_2"         "cluster_3"         "cluster_4"         "cluster_5"        
[17] "cluster_6"         "cluster_7"         "cluster_8"         "cluster_9"        
[21] "cluster_10"        "cluster_11"        "cluster_12"        "cluster_13"       
[25] "cluster_14"        "cluster_15"        "cluster_16"        "cluster_17"       
[29] "cluster_18"        "cluster_19"        "cluster_20"        "cluster_21"       
[33] "cluster_22"        "cluster_23"        "cluster_24"        "cluster_25"       
models <- fit_biospear(data = train,
                       biomarkers = biomarkers,
                       surv = c("Event_time", "Event"),
                       cvrts = cvrts,
                       inter = FALSE,
                       methods = "lasso")

Computing selection with method: lasso
print(models)
$lasso
              Age     BodyMassIndex        SystolicBP NonHDLcholesterol               Sex 
      0.044578010       0.012260426       0.002131799      -0.095917491       0.033412726 
     PrevalentCHD        InvSimpson         cluster_5 
      0.055240331      -0.017648201       0.106224418 

PREDICT IN TEST

environment(predRes2) <- asNamespace("biospear")
assignInNamespace("predRes", predRes2, ns = "biospear")
prediction <- predRes2(res = models,
                    method = "lasso",
                    traindata = train,
                    newdata = test,
                    int.cv = FALSE,
                    int.cv.nfold = 5,
                    time = seq(1, 16, 1),
                    trace = TRUE,
                    ncores = 20)

Computing prediction criteria for: training set

Computing prediction criteria for: validation set
t_train <- train$Event_time
e_train <- train$Event

t_test <- test$Event_time
e_test <- test$Event

m <- names(models)
lapply(m, function(i) {
    s_train <- prediction$scores_train[, i]
    s_test <- prediction$scores_extval[, i]
    # Concordance
    c_train <- rcorr.cens(exp(-s_train), Surv(t_train, e_train), outx = FALSE)
    c_test <- rcorr.cens(exp(-s_test), Surv(t_test, e_test), outx = FALSE)
    # Print C-Index
    print(paste0("C-Index in train set ", i, " is: ", c_train[1]))
    print(paste0("C-Index in test set ", i, " is: ", c_test[1]))
})
[1] "C-Index in train set lasso is: 0.719786125733815"
[1] "C-Index in test set lasso is: 0.71177703945034"
[[1]]
[1] "C-Index in test set lasso is: 0.71177703945034"

SAVE OUTPUT

# Save scores
scores <- data.frame(
    SampleID = rownames(test),
    Score = prediction$scores_extval[, 1],
    time = t_test,
    event = e_test,
    age = test$Age
)
print(hist(scores$Score))

$breaks
 [1] 1.0 1.2 1.4 1.6 1.8 2.0 2.2 2.4 2.6 2.8 3.0 3.2 3.4 3.6 3.8 4.0 4.2

$counts
 [1]  33  90 142 140 151 153 140 167 155 178 164 136  96  55   8   1

$density
 [1] 0.091210614 0.248756219 0.392482034 0.386954118 0.417357656 0.422885572 0.386954118
 [8] 0.461580984 0.428413488 0.491984522 0.453289110 0.375898286 0.265339967 0.152017689
[15] 0.022111664 0.002763958

$mids
 [1] 1.1 1.3 1.5 1.7 1.9 2.1 2.3 2.5 2.7 2.9 3.1 3.3 3.5 3.7 3.9 4.1

$xname
[1] "scores$Score"

$equidist
[1] TRUE

attr(,"class")
[1] "histogram"
DT::datatable(scores)
print(plot(scores$Score, scores$age))

NULL